import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import sys
import importlib
sys.path.insert(0, '../')
import general_utils as gen_ut
sys.path.insert(0, '../3_Hashtag_study')
import hashtag_util as ut_ht
dfUsers = pd.read_csv('../tweets.csv',usecols=['user_screen_name','user_code_error'])
dfUsers = dfUsers.groupby('user_screen_name').first()
dfUsers
| user_code_error | |
|---|---|
| user_screen_name | |
| 00000o0OOOO00 | NaN |
| 0000to2211 | NaN |
| 000Rizzo | NaN |
| 000oro000 | NaN |
| 000qwerty777 | NaN |
| ... | ... |
| zzpietro | NaN |
| zzuliram | NaN |
| zzurro65 | NaN |
| zzxang86 | NaN |
| zzzgrazia | NaN |
305259 rows × 1 columns
def df_preparation(cols,filename='../tweets.csv'):
columns = list(set(cols + ['id','user_screen_name','rt_user_screen_name','in_reply_to_screen_name']))
df = pd.read_csv(filename,low_memory=False, usecols=columns)
dfSuspect = pd.DataFrame()
for i,r in dfUsers.iterrows():
if r['user_code_error'] == 63:
df1 = df.loc[df['user_screen_name'] == i,:]
dfSuspect = pd.concat([df1,dfSuspect],ignore_index=True)
df1 = df.loc[df['rt_user_screen_name'] == i,:]
dfSuspect = pd.concat([df1,dfSuspect],ignore_index=True)
'''df1 = df.loc[df['in_reply_to_screen_name'] == i,:]
dfSuspect = pd.concat([df1,dfSuspect],ignore_index=True)'''
del df1
dfSuspect.drop_duplicates(subset=['id'])
return dfSuspect
df = df_preparation(['urls','created_at'])
df['created_at'] = pd.to_datetime(df['created_at'], format="%a %b %d %X %z %Y")
df
| id | created_at | user_screen_name | in_reply_to_screen_name | rt_user_screen_name | urls | |
|---|---|---|---|---|---|---|
| 0 | 1390042271047892994 | 2021-05-05 20:34:37+00:00 | zuoaiyppingtai9 | NaN | lefrasidiosho | [] |
| 1 | 1389964190056652807 | 2021-05-05 15:24:21+00:00 | zuoaiyppingtai6 | NaN | NaN | [] |
| 2 | 1343166961283424256 | 2020-12-27 12:08:53+00:00 | zarathusta78 | GiuseppeConteIT | NaN | [] |
| 3 | 1241982989560680448 | 2020-03-23 06:59:54+00:00 | zarathusta78 | lorenzoligato | NaN | [] |
| 4 | 1252567610232799232 | 2020-04-21 11:59:24+00:00 | zarathusta78 | tiecolino | NaN | [] |
| ... | ... | ... | ... | ... | ... | ... |
| 22566 | 1381256198951931905 | 2021-04-11 14:41:54+00:00 | 5v81nC7Rpt1m1KQ | NaN | NaN | [{'url': 'https://t.co/HZunGHTOaz', 'expanded_... |
| 22567 | 1382667996318076932 | 2021-04-15 12:11:53+00:00 | 5v81nC7Rpt1m1KQ | NaN | NaN | [] |
| 22568 | 1382669671850196996 | 2021-04-15 12:18:33+00:00 | 5v81nC7Rpt1m1KQ | NaN | NaN | [{'url': 'https://t.co/OhuHaeNoKA', 'expanded_... |
| 22569 | 1382670285510414337 | 2021-04-15 12:20:59+00:00 | 5v81nC7Rpt1m1KQ | NaN | NaN | [{'url': 'https://t.co/VHmu2K21w8', 'expanded_... |
| 22570 | 1370408215784038401 | 2021-03-12 16:15:54+00:00 | 0mhOdXnY9rBer9s | NaN | You3_JP | [] |
22571 rows × 6 columns
#Creating a map of all urls with the number of uses
listUrls = []
for s in df['urls']:
urls = gen_ut.get_string_json(s,'display_url')
for url in urls:
if url:
url = url.split("//")
url = url[0].split("/")
listUrls.append(url[0])
dfUrls = pd.DataFrame()
dfUrls['url'] = listUrls
dfUrls['count'] = 1
dfUrls = dfUrls.groupby('url').sum()
dfUrls.sort_values(['count'], axis = 0,inplace=True,ascending=False)
dfUrls
| count | |
|---|---|
| url | |
| twitter.com | 2100 |
| ift.tt | 228 |
| imolaoggi.it | 105 |
| ansa.it | 92 |
| adnkronos.com | 64 |
| ... | ... |
| ilmanifesto.it | 1 |
| ilgiorno.it | 1 |
| ilgiornaledellasera.blogspot.com | 1 |
| ilfat.to | 1 |
| zwebtv.com | 1 |
527 rows × 1 columns
n = 20
fig = px.histogram(dfUrls.head(n),y=dfUrls.head(n).index,x='count',
title="The most %d url used in the tweets"% n,orientation = 'h')
fig.update_yaxes(title='URL name')
fig.show()
#Creation of a dictionary of num of use per date
my_dict = {"url":[],"date":[], "count":[]};
for i in range(len(df)):
s = df.loc[i,'urls']
d = df.loc[i,'created_at']
url = gen_ut.get_string_json(s,'display_url')
if url:
url = url[0].split("//")
url = url[0].split("/")
my_dict["url"].append(url[0])
my_dict["date"].append(d)
my_dict["count"].append(1)
dfUseUrl = pd.DataFrame.from_dict(my_dict)
dfUseUrl['Week/Year'] = dfUseUrl['date'].apply(lambda x: "%d-%d" % (x.isocalendar()[1] , x.isocalendar()[0]))
dfUseUrl.drop(['date'], axis=1,inplace=True)
dfUseUrl = dfUseUrl.groupby(['Week/Year', 'url']).sum()
dfUseUrl.reset_index(inplace=True)
dfUseUrl['Week/Year'] =pd.to_datetime(dfUseUrl['Week/Year']+ '-1', format="%W-%Y-%w")
dfUseUrl.sort_values(['Week/Year'],axis = 0,inplace=True,ascending=True)
dfUseUrl
| Week/Year | url | count | |
|---|---|---|---|
| 724 | 2020-01-13 | twitter.com | 1 |
| 1194 | 2020-02-10 | twitter.com | 1 |
| 1193 | 2020-02-10 | open.online | 1 |
| 1229 | 2020-02-17 | chng.it | 1 |
| 1306 | 2020-03-02 | twitter.com | 4 |
| ... | ... | ... | ... |
| 745 | 2021-05-17 | byoblu.com | 2 |
| 744 | 2021-05-17 | ansa.it | 1 |
| 743 | 2021-05-17 | a.msn.com | 1 |
| 758 | 2021-05-17 | ilriformista.it | 1 |
| 746 | 2021-05-17 | corriere.it | 1 |
1327 rows × 3 columns
#All in the same graphic
fig = go.Figure()
for w in dfUrls.head().index:
mask = dfUseUrl['url'] == w
fig.add_trace(go.Scatter(x=dfUseUrl.loc[mask,'Week/Year'], y=dfUseUrl.loc[mask,'count'],
mode='lines+markers',
name=w))
fig.update_layout(title='All url history use',xaxis_title='Date',yaxis_title='use count')
fig.show()
# All in different graphic
for w in dfUrls.head().index:
fig = go.Figure()
mask = dfUseUrl['url'] == w
fig.add_trace(go.Scatter(x=dfUseUrl.loc[mask,'Week/Year'], y=dfUseUrl.loc[mask,'count'],
mode='lines+markers',
name=w))
fig.update_layout(title="History use of url '%s'"%w,xaxis_title='Date',yaxis_title='use count')
fig.show()
df = df_preparation(['hashtags','created_at'])
df['created_at'] = pd.to_datetime(df['created_at'], format="%a %b %d %X %z %Y")
#Creating a map of all hashtags with the number of uses
listHashtags = []
for s in df['hashtags']:
[ listHashtags.append(x) for x in gen_ut.get_string_json(s,'text') ]
dfHashtags = pd.DataFrame()
dfHashtags['hashtags'] = listHashtags
dfHashtags['count'] = 0
dfHashtags = dfHashtags.groupby('hashtags').count()
dfHashtags.sort_values(['count'],axis = 0,inplace=True,ascending=False)
dfHashtags
| count | |
|---|---|
| hashtags | |
| vaccino | 669 |
| vaccini | 333 |
| AstraZeneca | 293 |
| vaccinoCovid | 247 |
| COVID19 | 245 |
| ... | ... |
| PianoVaccini | 1 |
| Piano | 1 |
| Piaggio | 1 |
| PiL | 1 |
| 超限生物武器 | 1 |
2061 rows × 1 columns
importlib.reload(ut_ht)
dfUse = ut_ht.process_dfUse(df)
dfUse
| Week/Year | hashtag | count | |
|---|---|---|---|
| 489 | 2020-01-27 | ValentinaNappi | 1 |
| 490 | 2020-01-27 | coronarvirus | 1 |
| 491 | 2020-01-27 | leminchiate | 1 |
| 2113 | 2020-02-03 | Coranavirus | 1 |
| 3016 | 2020-02-10 | coronavirus | 1 |
| ... | ... | ... | ... |
| 4587 | 2021-05-17 | nogreenpass | 1 |
| 4588 | 2021-05-17 | somari | 1 |
| 4589 | 2021-05-17 | stronzate | 1 |
| 4577 | 2021-05-17 | Pfizer | 1 |
| 4579 | 2021-05-17 | VACCINATI | 3 |
5918 rows × 3 columns
ut_ht.visual_histogram(dfHashtags,30)
sum(dfHashtags['count'])
9873
hastagRemove = ['vaccin.*','covid.*','corona.*','astrazeneca','pfizer','sarscov2','sputnikv','moderna']
dfHashtagFiltered = dfHashtags
for r in hastagRemove:
mask = dfHashtagFiltered.index.str.lower().str.match(r) == True
dfHashtagFiltered.drop(dfHashtagFiltered[mask].index, inplace=True)
dfMoreFiltered = dfHashtagFiltered
hastagRemove = ['.*lombardia.*','draghi','conte','m5s','mattarella','salvini','speranza','renzi','lega','.*governo.*',
'.*moratti.*','zingaretti','scanzi','burioni','crisanti']
for r in hastagRemove:
mask = dfMoreFiltered.index.str.lower().str.match(r) == True
dfMoreFiltered.drop(dfMoreFiltered[mask].index, inplace=True)
ut_ht.visual_histogram(dfMoreFiltered,20)
ut_ht.visual_by_date_together(dfMoreFiltered,dfUse)
ut_ht.visual_by_date_split(dfMoreFiltered,dfUse)
df = df_preparation(['is_self_rt'])
dfSelf = df.groupby('user_screen_name').sum()
dfSelf['all_rt'] = df.groupby('user_screen_name').count().iloc[:,0]
dfSelf.sort_values(['all_rt'],inplace=True)
dfSelf
| id | is_self_rt | all_rt | |
|---|---|---|---|
| user_screen_name | |||
| 00000o0OOOO00 | 1.343216e+18 | 0 | 1 |
| beppa_giosef | 1.362789e+18 | 0 | 1 |
| beretta_gio | 1.366367e+18 | 0 | 1 |
| bernisilvy | 1.338372e+18 | 0 | 1 |
| beverlytozier3 | 1.390300e+18 | 0 | 1 |
| ... | ... | ... | ... |
| danielsun2021mi | 1.049776e+21 | 0 | 762 |
| AlexTheMod | 1.162715e+21 | 0 | 857 |
| OverlookHotel71 | 1.176089e+21 | 0 | 874 |
| venetolink | 1.246664e+21 | 10 | 917 |
| d_essere | 3.607896e+21 | 2 | 2641 |
3290 rows × 3 columns
n = 20
fig = make_subplots(rows=1, cols=1)
fig.add_trace(go.Bar(y=dfSelf.tail(n).index, x=dfSelf.tail(n)['all_rt'],orientation='h', name = 'All retweet'), row=1, col=1)
fig.add_trace(go.Bar(y=dfSelf.tail(n).index, x=dfSelf.tail(n)['is_self_rt'],orientation='h', name = 'Self retweet'), row=1, col=1)
fig.update_layout(title="How many retweet are self retweet (the most 20 retweeter)")
fig.update_xaxes(title="Count of retweets")
fig.update_yaxes(title="Username")
fig.update_layout(
barmode="overlay",
bargap=0.1)
fig.show()
df = df_preparation(['is_self_rt'])
df
| id | user_screen_name | in_reply_to_screen_name | rt_user_screen_name | is_self_rt | |
|---|---|---|---|---|---|
| 0 | 1390042271047892994 | zuoaiyppingtai9 | NaN | lefrasidiosho | False |
| 1 | 1389964190056652807 | zuoaiyppingtai6 | NaN | NaN | False |
| 2 | 1343166961283424256 | zarathusta78 | GiuseppeConteIT | NaN | False |
| 3 | 1241982989560680448 | zarathusta78 | lorenzoligato | NaN | False |
| 4 | 1252567610232799232 | zarathusta78 | tiecolino | NaN | False |
| ... | ... | ... | ... | ... | ... |
| 22566 | 1381256198951931905 | 5v81nC7Rpt1m1KQ | NaN | NaN | False |
| 22567 | 1382667996318076932 | 5v81nC7Rpt1m1KQ | NaN | NaN | False |
| 22568 | 1382669671850196996 | 5v81nC7Rpt1m1KQ | NaN | NaN | False |
| 22569 | 1382670285510414337 | 5v81nC7Rpt1m1KQ | NaN | NaN | False |
| 22570 | 1370408215784038401 | 0mhOdXnY9rBer9s | NaN | You3_JP | False |
22571 rows × 5 columns
dfRetweet = df.dropna(subset=['rt_user_screen_name']).copy()
dfRetweet.drop(columns=['in_reply_to_screen_name'],inplace=True,errors='ignore')
dfRetweet = dfRetweet.groupby('rt_user_screen_name').count()
dfRetweet.rename(columns={'user_screen_name':'all_rt'},inplace=True,errors='ignore')
dfRetweet['self_rt'] = df.dropna(subset=['rt_user_screen_name'
]).copy().groupby('rt_user_screen_name').sum().loc[:,'is_self_rt']
dfRetweet.drop(columns=['is_self_rt'],inplace=True,errors='ignore')
dfRetweet['real_rt'] = dfRetweet['all_rt'] - dfRetweet['self_rt']
dfRetweet.sort_values('real_rt',ascending=False,inplace=True)
dfRetweet
| id | all_rt | self_rt | real_rt | |
|---|---|---|---|---|
| rt_user_screen_name | ||||
| d_essere | 2252 | 2252 | 2 | 2250 |
| Cri_Giordano | 405 | 405 | 0 | 405 |
| DHofmannsthal | 411 | 411 | 6 | 405 |
| DottAngeloC | 298 | 298 | 0 | 298 |
| VezzoliNarciso | 292 | 292 | 0 | 292 |
| ... | ... | ... | ... | ... |
| PontiBenedetto | 1 | 1 | 0 | 1 |
| PontelliMichele | 1 | 1 | 0 | 1 |
| zucconiluca | 1 | 1 | 0 | 1 |
| Bastian42782535 | 14 | 14 | 14 | 0 |
| bossebasta | 2 | 2 | 2 | 0 |
2771 rows × 4 columns
n = 20
fig =px.histogram(dfRetweet.head(n),y=dfRetweet.head(n).index,x='real_rt',orientation='h')
fig.update_yaxes(title='username')
fig.update_layout(title="The most %d users retweeted (without self retweet)"%n)
fig.show()
dfReply = df.dropna(subset=['in_reply_to_screen_name']).copy()
dfReply.drop(columns=['rt_user_screen_name'],inplace=True,errors='ignore')
dfReply.drop(columns=['is_self_rt'],inplace=True,errors='ignore')
dfReply = dfReply.groupby('in_reply_to_screen_name').count()
dfReply.rename(columns={'user_screen_name':'count'},inplace=True,errors='ignore')
dfReply.sort_values('count',ascending=False,inplace=True)
dfReply
| id | count | |
|---|---|---|
| in_reply_to_screen_name | ||
| DottAngeloC | 42 | 42 |
| MediasetTgcom24 | 41 | 41 |
| Agenzia_Ansa | 34 | 34 |
| d_essere | 29 | 29 |
| valy_s | 24 | 24 |
| ... | ... | ... |
| SucatelaAScle | 1 | 1 |
| DonatoPorreca | 1 | 1 |
| StufaMarcia1 | 1 | 1 |
| Dorayak56884206 | 1 | 1 |
| 00Zucchi | 1 | 1 |
1847 rows × 2 columns
n = 20
fig =px.histogram(dfReply.head(n),y=dfReply.head(n).index,x='count',orientation='h')
fig.update_yaxes(title='username')
fig.update_layout(title="The most %d users replied"%n)
fig.show()
df = df_preparation(['is_self_rt'])
df1 = df.copy()
df1['sum_total_posts'] = 1
df1 = df1.groupby('user_screen_name').sum()
df = df.groupby('user_screen_name').count()
df['sum_total_posts'] = df1['sum_total_posts']
df['sum_self_rt'] = df1['is_self_rt']
del df1
df.rename(columns={'in_reply_to_screen_name':'num_reply','rt_user_screen_name':'num_rt'},inplace=True,errors='ignore')
'''df['num_in_reply'] = dfReply
df.loc[df['num_in_reply'].isna(),'num_in_reply'] = 0'''
df['num_in_rt'] = dfRetweet['real_rt']
df.loc[df['num_in_rt'].isna(),'num_in_rt'] = 0
df
| id | num_reply | num_rt | is_self_rt | sum_total_posts | sum_self_rt | num_in_rt | |
|---|---|---|---|---|---|---|---|
| user_screen_name | |||||||
| 00000o0OOOO00 | 1 | 0 | 1 | 1 | 1.0 | 0 | 1.0 |
| 0Zedda | 1 | 0 | 1 | 1 | 1.0 | 0 | 0.0 |
| 0mhOdXnY9rBer9s | 1 | 0 | 1 | 1 | 1.0 | 0 | 0.0 |
| 0zen_ | 1 | 0 | 1 | 1 | 1.0 | 0 | 0.0 |
| 12qbert | 8 | 0 | 8 | 8 | 8.0 | 0 | 4.0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| zoppi_carla | 1 | 0 | 1 | 1 | 1.0 | 0 | 0.0 |
| zuoaiyppingtai6 | 1 | 0 | 0 | 1 | 1.0 | 0 | 0.0 |
| zuoaiyppingtai9 | 1 | 0 | 1 | 1 | 1.0 | 0 | 0.0 |
| zzhang12101 | 1 | 0 | 1 | 1 | 1.0 | 0 | 0.0 |
| zziocane66 | 1 | 0 | 1 | 1 | 1.0 | 0 | 0.0 |
3290 rows × 7 columns
df.describe()
| id | num_reply | num_rt | is_self_rt | sum_total_posts | sum_self_rt | num_in_rt | |
|---|---|---|---|---|---|---|---|
| count | 3290.000000 | 3290.000000 | 3290.000000 | 3290.000000 | 3290.000000 | 3290.000000 | 3290.000000 |
| mean | 6.860486 | 0.982067 | 4.803040 | 6.860486 | 6.860486 | 0.060182 | 2.706991 |
| std | 60.608496 | 9.519562 | 47.386999 | 60.608496 | 60.608496 | 2.533094 | 42.268012 |
| min | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 |
| 25% | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 |
| 50% | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 |
| 75% | 2.000000 | 0.000000 | 2.000000 | 2.000000 | 2.000000 | 0.000000 | 0.000000 |
| max | 2641.000000 | 350.000000 | 2162.000000 | 2641.000000 | 2641.000000 | 144.000000 | 2250.000000 |
df.sort_values('num_in_rt',ascending=False,inplace=True)
df = df.head(10)
df
| id | num_reply | num_rt | is_self_rt | sum_total_posts | sum_self_rt | num_in_rt | |
|---|---|---|---|---|---|---|---|
| user_screen_name | |||||||
| d_essere | 2641 | 61 | 2162 | 2641 | 2641.0 | 2 | 2250.0 |
| Cri_Giordano | 122 | 45 | 16 | 122 | 122.0 | 0 | 405.0 |
| DHofmannsthal | 58 | 11 | 18 | 58 | 58.0 | 6 | 405.0 |
| DottAngeloC | 466 | 350 | 6 | 466 | 466.0 | 0 | 298.0 |
| VezzoliNarciso | 68 | 8 | 7 | 68 | 68.0 | 0 | 292.0 |
| sumaistu47 | 292 | 2 | 0 | 292 | 292.0 | 0 | 190.0 |
| OverlookHotel71 | 874 | 161 | 567 | 874 | 874.0 | 0 | 187.0 |
| NuovoNickname | 127 | 50 | 33 | 127 | 127.0 | 0 | 159.0 |
| vincbrn69 | 76 | 41 | 0 | 76 | 76.0 | 0 | 156.0 |
| noitre32 | 10 | 0 | 10 | 10 | 10.0 | 0 | 155.0 |
n = 20
fig = make_subplots(rows=1, cols=1)
fig.add_trace(go.Bar(y=df.head(n).index, x=df.head(n)['num_reply'],orientation='h', name = 'Reply'), row=1, col=1)
fig.add_trace(go.Bar(y=df.head(n).index, x=df.head(n)['num_in_rt'],orientation='h', name = 'Retweet'), row=1, col=1)
fig.add_trace(go.Bar(y=df.head(n).index, x=df.head(n)['sum_self_rt'],orientation='h', name = 'Self retweet')
, row=1, col=1)
fig.update_layout(title="How many time this users are retweeted")
fig.update_xaxes(title="Count")
fig.update_yaxes(title="Username")
fig.show()
df = df_preparation([])
dfNovax = df_preparation([],filename='../tweets_novax.csv')
dfProvax = df_preparation([],filename='../tweets_provax.csv')
print("There are %d (%d%%) novax that have been suspended from Twitter"%
(len(dfNovax),(len(dfNovax)/len(df))*100))
There are 13085 (57%) novax that have been suspended from Twitter
print("There are %d (%d%%) provax that have been suspended from Twitter"%
(len(dfProvax),(len(dfProvax)/len(df))*100))
There are 1809 (8%) provax that have been suspended from Twitter